package cn.tyoui.index;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Extracts the textual content of a file through Apache Tika and strips
 * characters that are not wanted in the result.
 *
 * @author Tyoui
 */
public class FileContent {

    private static final Logger LOGGER = Logger.getLogger(FileContent.class.getName());

    /** Write limit handed to {@link BodyContentHandler}. */
    private static final int WRITE_LIMIT = 1024 * 1024 * 100;

    /** Lower-case file name suffixes that {@link #init(File)} will parse. */
    private static final Set<String> SUPPORTED_SUFFIXES = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList("txt", "doc", "docx", "pdf", "xls", "xlsx")));

    /**
     * Characters removed from the extracted text: ASCII letters and digits,
     * tab/newline/carriage return/space, and the listed ASCII and full-width
     * punctuation. Compiled once because it is applied on every call.
     */
    private static final Pattern STRIPPED_CHARACTERS = Pattern.compile(
            "[<>{}|/%$#@~·`()?？_&（）+【】\t\n\r0-9A-Za-z,.!;，。；！“”:：、 -]");

    /**
     * Parses a file with the given parser and returns the body text.
     *
     * @param filePath file to read
     * @param parser   parser used to interpret the file
     * @return the text collected by the content handler
     * @throws Exception if the file cannot be opened or parsing fails
     */
    private static String wordExport(File filePath, Parser parser) throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, filePath.getName());

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        ContentHandler contentHandler = new BodyContentHandler(WRITE_LIMIT);

        // try-with-resources closes the stream even when parse() throws.
        try (InputStream inputStream = new FileInputStream(filePath)) {
            parser.parse(inputStream, contentHandler, metadata, parseContext);
        }
        return contentHandler.toString();
    }

    /**
     * Returns the lower-cased text after the last '.' of a file name.
     *
     * @param name file name
     * @return the suffix, or an empty string when the name contains no '.'
     */
    private static String suffixOf(String name) {
        int dot = name.lastIndexOf('.');
        if (dot < 0) {
            return "";
        }
        return name.substring(dot + 1).toLowerCase(Locale.ROOT);
    }

    /**
     * Extracts the filtered text of a file.
     *
     * <p>Only files whose suffix is one of txt, doc, docx, pdf, xls or xlsx
     * (compared case-insensitively) are parsed. A {@link TXTParser} is used
     * for txt files and an {@link AutoDetectParser} for the others. The
     * extracted text then has every character matched by
     * {@link #STRIPPED_CHARACTERS} removed.
     *
     * @param file file to read
     * @return the filtered text, or an empty string when the suffix is not
     *         supported or extraction fails
     */
    public static String init(File file) {
        String suffix = suffixOf(file.getName());
        if (!SUPPORTED_SUFFIXES.contains(suffix)) {
            return "";
        }
        try {
            Parser parser = "txt".equals(suffix) ? new TXTParser() : new AutoDetectParser();
            String content = wordExport(file, parser);
            return STRIPPED_CHARACTERS.matcher(content).replaceAll("");
        } catch (Exception e) {
            // Extraction is best-effort: report the failure and fall back to
            // empty content instead of propagating to the caller.
            LOGGER.log(Level.WARNING, "Failed to extract text from " + file, e);
            return "";
        }
    }
}
